{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import os\n",
    "import pytesseract as pt\n",
    "from PIL import Image\n",
    "import re\n",
    "import time\n",
    "import enchant\n",
    "\n",
    "\"\"\"提取图片文字\"\"\"\n",
    "\n",
    "# im=Image.open(r'D:\\test\\电子课本初中英语人教版英语八年级上册\\127.jpg')\n",
    "# #print(pt.image_to_string(im,lang='eng'))\n",
    "# word_test = pt.image_to_string(im,lang='eng')\n",
    "# #for line in word_test.splitlines():#print(line)\n",
    "# #result = ''.join(re.findall(r'[A-Za-z]', word_test))\n",
    "# #print(result)\n",
    "# end = time.time()\n",
    "# print(\"运行时间：\",end - start)\n",
    "from pathlib import Path\n",
    "#os.getcwd()\n",
    "path = Path(\"C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp\\\\data\\\\book\")\n",
    "#os.listdir(path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"\"\"提取正确单词\"\"\"\n",
    "\n",
    "# words = line.split()\n",
    "# #dic = enchant.Dict(\"en_US\")\n",
    "# all_words = []\n",
    "# for word in words:\n",
    "#     if len(word)>=3 :\n",
    "#         #print(word.lower())\n",
    "#         all_words.append(word.lower())\n",
    "\n",
    "\n",
    "# print(\"结果：\",all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "high = {\"Ren_1\":[78,84],\"Ren_2\":[96,101],\"Ren_3\":[61,71],\"Ren_4\":[79,89],\"Ren_5\":[99,105],\"Ren_6\":[99,105],\"Ren_7\":[106,112],\"Ren_8\":[107,114],\"Ren_9\":[119,125],\"Ren_10\":[117,122],\n",
    "         \"Wai_1\":[58,64],\"Wai_2\":[56,64],\"Wai_3\":[3,6],\"Wai_6\":[75,80]}\n",
    "\"\"\"\n",
    "high = {\"Ren_1\":[78,84],\"Ren_2\":[96,101],\"Ren_3\":[61,71],\"Ren_4\":[79,89],\"Ren_5\":[99,105],\"Ren_6\":[99,105],\"Ren_7\":[106,112],\"Ren_8\":[107,114],\"Ren_9\":[119,125],\"Ren_10\":[117,122],\n",
    "        \"Wai_1\":[58,64],\"Wai_2\":[56,64],\"Wai_3\":[3,6],\"Wai_6\":[75,80]}\n",
    "# m = high.get(\"Ren_1\")\n",
    "# m[0]\n",
    "\n",
    "middle = {'Ai_7_A':[149,157],'Ai_8_A':[138,145],'Ai_8_B':[148,155],'Ai_9_A':[136,144],'Ai_9_B':[136,144],'Ren_7_A':[142,151],'Ren_8_A':[138,148],'Ren_8_B':[0,0],'Wai_7_A':[107,115],'Wai_7_B':[118,124],'Wai_8_A':[142,147],'Wai_8_B':[125,130],'Wai_9_A':[151,157],'Wai_9_B':[94,98]}\n",
    "#middle.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\149.jpg\n",
      "spending_time: -13464.513195753098\n",
      "当前 52 个单词\n",
      "共计 52 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\150.jpg\n",
      "spending_time: -13469.17425942421\n",
      "当前 68 个单词\n",
      "共计 120 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\151.jpg\n",
      "spending_time: -13474.162222146988\n",
      "当前 68 个单词\n",
      "共计 188 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\152.jpg\n",
      "spending_time: -13478.618109226227\n",
      "当前 70 个单词\n",
      "共计 258 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\153.jpg\n",
      "spending_time: -13483.271763324738\n",
      "当前 72 个单词\n",
      "共计 330 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\154.jpg\n",
      "spending_time: -13488.03487086296\n",
      "当前 66 个单词\n",
      "共计 396 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\155.jpg\n",
      "spending_time: -13492.606048583984\n",
      "当前 69 个单词\n",
      "共计 465 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\156.jpg\n",
      "spending_time: -13497.340684175491\n",
      "当前 86 个单词\n",
      "共计 551 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_7_A\\157.jpg\n",
      "spending_time: -13499.959990501404\n",
      "当前 46 个单词\n",
      "共计 597 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\138.jpg\n",
      "spending_time: -13505.581961393356\n",
      "当前 63 个单词\n",
      "共计 660 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\139.jpg\n",
      "spending_time: -13511.676236629486\n",
      "当前 74 个单词\n",
      "共计 734 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\140.jpg\n",
      "spending_time: -13517.875795125961\n",
      "当前 67 个单词\n",
      "共计 801 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\141.jpg\n",
      "spending_time: -13523.910050153732\n",
      "当前 69 个单词\n",
      "共计 870 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\142.jpg\n",
      "spending_time: -13529.789454221725\n",
      "当前 71 个单词\n",
      "共计 941 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\143.jpg\n",
      "spending_time: -13536.267663478851\n",
      "当前 62 个单词\n",
      "共计 1003 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\144.jpg\n",
      "spending_time: -13542.643804311752\n",
      "当前 68 个单词\n",
      "共计 1071 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_A\\145.jpg\n",
      "spending_time: -13548.246319293976\n",
      "当前 71 个单词\n",
      "共计 1142 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\148.jpg\n",
      "spending_time: -13554.519407987595\n",
      "当前 78 个单词\n",
      "共计 1220 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\149.jpg\n",
      "spending_time: -13562.161168813705\n",
      "当前 92 个单词\n",
      "共计 1312 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\150.jpg\n",
      "spending_time: -13569.822042226791\n",
      "当前 87 个单词\n",
      "共计 1399 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\151.jpg\n",
      "spending_time: -13576.520459651947\n",
      "当前 84 个单词\n",
      "共计 1483 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\152.jpg\n",
      "spending_time: -13583.475625514984\n",
      "当前 90 个单词\n",
      "共计 1573 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\153.jpg\n",
      "spending_time: -13590.823880910873\n",
      "当前 84 个单词\n",
      "共计 1657 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\154.jpg\n",
      "spending_time: -13597.929473400116\n",
      "当前 107 个单词\n",
      "共计 1764 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_8_B\\155.jpg\n",
      "spending_time: -13603.23106598854\n",
      "当前 68 个单词\n",
      "共计 1832 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\136.jpg\n",
      "spending_time: -13608.070314884186\n",
      "当前 51 个单词\n",
      "共计 1883 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\137.jpg\n",
      "spending_time: -13613.848683834076\n",
      "当前 63 个单词\n",
      "共计 1946 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\138.jpg\n",
      "spending_time: -13620.297372102737\n",
      "当前 54 个单词\n",
      "共计 2000 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\139.jpg\n",
      "spending_time: -13625.939027786255\n",
      "当前 68 个单词\n",
      "共计 2068 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\140.jpg\n",
      "spending_time: -13632.049129962921\n",
      "当前 59 个单词\n",
      "共计 2127 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\141.jpg\n",
      "spending_time: -13637.604595184326\n",
      "当前 66 个单词\n",
      "共计 2193 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\142.jpg\n",
      "spending_time: -13644.010001897812\n",
      "当前 61 个单词\n",
      "共计 2254 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\143.jpg\n",
      "spending_time: -13650.75248169899\n",
      "当前 75 个单词\n",
      "共计 2329 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_A\\144.jpg\n",
      "spending_time: -13656.172738552094\n",
      "当前 66 个单词\n",
      "共计 2395 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\136.jpg\n",
      "spending_time: -13661.074229717255\n",
      "当前 51 个单词\n",
      "共计 2446 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\137.jpg\n",
      "spending_time: -13666.876457214355\n",
      "当前 63 个单词\n",
      "共计 2509 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\138.jpg\n",
      "spending_time: -13673.36409497261\n",
      "当前 54 个单词\n",
      "共计 2563 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\139.jpg\n",
      "spending_time: -13679.030692100525\n",
      "当前 68 个单词\n",
      "共计 2631 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\140.jpg\n",
      "spending_time: -13685.112521409988\n",
      "当前 59 个单词\n",
      "共计 2690 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\141.jpg\n",
      "spending_time: -13690.707123756409\n",
      "当前 66 个单词\n",
      "共计 2756 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\142.jpg\n",
      "spending_time: -13697.132889509201\n",
      "当前 61 个单词\n",
      "共计 2817 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\143.jpg\n",
      "spending_time: -13703.819556951523\n",
      "当前 75 个单词\n",
      "共计 2892 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ai_9_B\\144.jpg\n",
      "spending_time: -13709.176576137543\n",
      "当前 66 个单词\n",
      "共计 2958 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\142.jpg\n",
      "spending_time: -13716.161101818085\n",
      "当前 75 个单词\n",
      "共计 3033 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\143.jpg\n",
      "spending_time: -13722.666283845901\n",
      "当前 87 个单词\n",
      "共计 3120 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\144.jpg\n",
      "spending_time: -13729.16823720932\n",
      "当前 98 个单词\n",
      "共计 3218 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\145.jpg\n",
      "spending_time: -13735.549175262451\n",
      "当前 95 个单词\n",
      "共计 3313 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\146.jpg\n",
      "spending_time: -13742.687650442123\n",
      "当前 94 个单词\n",
      "共计 3407 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\147.jpg\n",
      "spending_time: -13749.066664934158\n",
      "当前 84 个单词\n",
      "共计 3491 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\148.jpg\n",
      "spending_time: -13755.636038303375\n",
      "当前 90 个单词\n",
      "共计 3581 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\149.jpg\n",
      "spending_time: -13763.004401445389\n",
      "当前 94 个单词\n",
      "共计 3675 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\150.jpg\n",
      "spending_time: -13769.686198949814\n",
      "当前 95 个单词\n",
      "共计 3770 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_7_A\\151.jpg\n",
      "spending_time: -13772.623005390167\n",
      "当前 32 个单词\n",
      "共计 3802 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\138.jpg\n",
      "spending_time: -13781.182984352112\n",
      "当前 63 个单词\n",
      "共计 3865 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\139.jpg\n",
      "spending_time: -13790.227341413498\n",
      "当前 74 个单词\n",
      "共计 3939 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\140.jpg\n",
      "spending_time: -13798.541671037674\n",
      "当前 70 个单词\n",
      "共计 4009 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\141.jpg\n",
      "spending_time: -13806.364505052567\n",
      "当前 93 个单词\n",
      "共计 4102 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\142.jpg\n",
      "spending_time: -13813.481134176254\n",
      "当前 78 个单词\n",
      "共计 4180 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\143.jpg\n",
      "spending_time: -13821.290307760239\n",
      "当前 81 个单词\n",
      "共计 4261 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\144.jpg\n",
      "spending_time: -13829.118739366531\n",
      "当前 73 个单词\n",
      "共计 4334 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\145.jpg\n",
      "spending_time: -13837.388370037079\n",
      "当前 84 个单词\n",
      "共计 4418 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\146.jpg\n",
      "spending_time: -13845.804193735123\n",
      "当前 76 个单词\n",
      "共计 4494 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\147.jpg\n",
      "spending_time: -13849.378144025803\n",
      "当前 36 个单词\n",
      "共计 4530 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_A\\148.jpg\n",
      "spending_time: -13853.039875268936\n",
      "当前 111 个单词\n",
      "共计 4641 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Ren_8_B\\0.jpg\n",
      "spending_time: -13853.341247081757\n",
      "当前 0 个单词\n",
      "共计 4641 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\107.jpg\n",
      "spending_time: -13857.717028141022\n",
      "当前 54 个单词\n",
      "共计 4695 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\108.jpg\n",
      "spending_time: -13863.103332996368\n",
      "当前 84 个单词\n",
      "共计 4779 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\109.jpg\n",
      "spending_time: -13868.029534578323\n",
      "当前 55 个单词\n",
      "共计 4834 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\110.jpg\n",
      "spending_time: -13873.025834560394\n",
      "当前 67 个单词\n",
      "共计 4901 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\111.jpg\n",
      "spending_time: -13877.902372837067\n",
      "当前 52 个单词\n",
      "共计 4953 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\112.jpg\n",
      "spending_time: -13883.081752300262\n",
      "当前 57 个单词\n",
      "共计 5010 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\113.jpg\n",
      "spending_time: -13888.270652294159\n",
      "当前 67 个单词\n",
      "共计 5077 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\114.jpg\n",
      "spending_time: -13892.856353282928\n",
      "当前 52 个单词\n",
      "共计 5129 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_A\\115.jpg\n",
      "spending_time: -13896.859423160553\n",
      "当前 62 个单词\n",
      "共计 5191 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\118.jpg\n",
      "spending_time: -13905.121941328049\n",
      "当前 91 个单词\n",
      "共计 5282 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\119.jpg\n",
      "spending_time: -13913.933528661728\n",
      "当前 102 个单词\n",
      "共计 5384 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\120.jpg\n",
      "spending_time: -13923.805186033249\n",
      "当前 86 个单词\n",
      "共计 5470 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\121.jpg\n",
      "spending_time: -13932.644939422607\n",
      "当前 110 个单词\n",
      "共计 5580 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\122.jpg\n",
      "spending_time: -13940.813114881516\n",
      "当前 92 个单词\n",
      "共计 5672 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\123.jpg\n",
      "spending_time: -13950.38477563858\n",
      "当前 107 个单词\n",
      "共计 5779 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_7_B\\124.jpg\n",
      "spending_time: -13958.908766746521\n",
      "当前 105 个单词\n",
      "共计 5884 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_A\\142.jpg\n",
      "spending_time: -13963.410431623459\n",
      "当前 33 个单词\n",
      "共计 5917 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_A\\143.jpg\n",
      "spending_time: -13968.177033662796\n",
      "当前 45 个单词\n",
      "共计 5962 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_A\\144.jpg\n",
      "spending_time: -13972.337088346481\n",
      "当前 54 个单词\n",
      "共计 6016 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_A\\145.jpg\n",
      "spending_time: -13977.181647539139\n",
      "当前 68 个单词\n",
      "共计 6084 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_A\\146.jpg\n",
      "spending_time: -13982.306263685226\n",
      "当前 63 个单词\n",
      "共计 6147 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_A\\147.jpg\n",
      "spending_time: -13987.78738117218\n",
      "当前 54 个单词\n",
      "共计 6201 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_B\\125.jpg\n",
      "spending_time: -13995.471985578537\n",
      "当前 82 个单词\n",
      "共计 6283 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_B\\126.jpg\n",
      "spending_time: -14003.343252420425\n",
      "当前 70 个单词\n",
      "共计 6353 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_B\\127.jpg\n",
      "spending_time: -14011.01607465744\n",
      "当前 75 个单词\n",
      "共计 6428 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_B\\128.jpg\n",
      "spending_time: -14018.96264743805\n",
      "当前 69 个单词\n",
      "共计 6497 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_B\\129.jpg\n",
      "spending_time: -14027.352599859238\n",
      "当前 106 个单词\n",
      "共计 6603 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_8_B\\130.jpg\n",
      "spending_time: -14031.504673719406\n",
      "当前 44 个单词\n",
      "共计 6647 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\151.jpg\n",
      "spending_time: -14036.131997823715\n",
      "当前 49 个单词\n",
      "共计 6696 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\152.jpg\n",
      "spending_time: -14040.957650899887\n",
      "当前 54 个单词\n",
      "共计 6750 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\153.jpg\n",
      "spending_time: -14045.62605047226\n",
      "当前 45 个单词\n",
      "共计 6795 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\154.jpg\n",
      "spending_time: -14050.641218185425\n",
      "当前 67 个单词\n",
      "共计 6862 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\155.jpg\n",
      "spending_time: -14055.667287349701\n",
      "当前 73 个单词\n",
      "共计 6935 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\156.jpg\n",
      "spending_time: -14060.591763019562\n",
      "当前 55 个单词\n",
      "共计 6990 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_A\\157.jpg\n",
      "spending_time: -14062.811889410019\n",
      "当前 19 个单词\n",
      "共计 7009 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_B\\94.jpg\n",
      "spending_time: -14069.007991075516\n",
      "当前 64 个单词\n",
      "共计 7073 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_B\\95.jpg\n",
      "spending_time: -14075.280796051025\n",
      "当前 81 个单词\n",
      "共计 7154 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_B\\96.jpg\n",
      "spending_time: -14080.539909362793\n",
      "当前 87 个单词\n",
      "共计 7241 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_B\\97.jpg\n",
      "spending_time: -14084.201699495316\n",
      "当前 28 个单词\n",
      "共计 7269 个单词\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\middle\\Wai_9_B\\98.jpg\n",
      "spending_time: -14089.925464630127\n",
      "当前 64 个单词\n",
      "共计 7333 个单词\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp'"
      ]
     },
     "execution_count": 172,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#处理初中的文本的路径\n",
    "path = path/\"middle\"\n",
    "new_files = []\n",
    "for file in os.listdir(path):\n",
    "    tmp = middle[file]\n",
    "    #print(tmp[0],tmp[1],type(tmp))\n",
    "    new_path = os.path.join(path, file)\n",
    "    #print(file)\n",
    "    for f in os.listdir(new_path):\n",
    "        #print(f)\n",
    "        if tmp[0] <= int(f.rstrip(\".jpg\")) <= tmp[1]:\n",
    "            path = os.path.join(new_path,f)\n",
    "            #print(f)\n",
    "            new_files += [path]#得到所有处理文件的路径\n",
    "            path = Path(\"C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp\\\\data\\\\book\\\\middle\\\\\")\n",
    "#print(new_files)\n",
    "\n",
    "middle_word_list = []\n",
    "\"\"\"获取文字\"\"\"\n",
    "for jpg_path in new_files:\n",
    "    print(jpg_path)\n",
    "#---------------------------ocr-----------------------------------\n",
    "    pt.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n",
    "    all_words = []\n",
    "    start = time.time()\n",
    "    im = Image.open(os.path.abspath(jpg_path))\n",
    "    raw_word = pt.image_to_string(im,lang = 'eng')\n",
    "    words = re.sub(r\"[^A-Za-z\\s]+\", \"\", raw_word.strip())#正则提取所有带噪声的单词\n",
    "    dic = enchant.Dict(\"en_US\")#创建字典\n",
    "    for word in words.split():\n",
    "        if len(word)>=3 and dic.check(word.lower()):\n",
    "            all_words.append(word.lower())#去噪\n",
    "    start = time.time()\n",
    "    middle_word_list += all_words\n",
    "    print(\"spending_time:\",end - start)\n",
    "    print(\"当前\",len(all_words),\"个单词\")\n",
    "    print(\"共计\",len(middle_word_list),\"个单词\")\n",
    "#-------------------------------------------------------------------\n",
    "    path = Path(\"C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp\\\\data\\\\book\")#init path\n",
    "\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [],
   "source": [
    "#middle_word_list存储\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "pf = pd.DataFrame(data=middle_word_list)\n",
    "pf.to_csv(\"./data/book/middle_word_list.csv\", encoding=\"utf-8\", header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tmp = 0\n",
    "# t = 0\n",
    "# for key in middle.keys():\n",
    "#     t = middle[key][1] - middle[key][0]\n",
    "#     tmp += t+1\n",
    "# print(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\78.jpg\n",
      "spending_time: -15339.369122743607\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\79.jpg\n",
      "spending_time: -15349.434293985367\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\80.jpg\n",
      "spending_time: -15359.583738327026\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\81.jpg\n",
      "spending_time: -15368.196361541748\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\82.jpg\n",
      "spending_time: -15379.291596651077\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\83.jpg\n",
      "spending_time: -15389.398183345795\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_1\\84.jpg\n",
      "spending_time: -15394.817720651627\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_10\\117.jpg\n",
      "spending_time: -15404.266087532043\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_10\\118.jpg\n",
      "spending_time: -15415.297760248184\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_10\\119.jpg\n",
      "spending_time: -15425.78970360756\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_10\\120.jpg\n",
      "spending_time: -15435.585498094559\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_10\\121.jpg\n",
      "spending_time: -15445.896906137466\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_10\\122.jpg\n",
      "spending_time: -15453.119769334793\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_2\\100.jpg\n",
      "spending_time: -15457.971754312515\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_2\\101.jpg\n",
      "spending_time: -15461.732523441315\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_2\\96.jpg\n",
      "spending_time: -15466.3437666893\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_2\\97.jpg\n",
      "spending_time: -15471.42491889\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_2\\98.jpg\n",
      "spending_time: -15476.526772737503\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_2\\99.jpg\n",
      "spending_time: -15481.653309822083\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\61.jpg\n",
      "spending_time: -15486.134802818298\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\62.jpg\n",
      "spending_time: -15490.77373623848\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\63.jpg\n",
      "spending_time: -15492.905389785767\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\64.jpg\n",
      "spending_time: -15495.551629304886\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\65.jpg\n",
      "spending_time: -15501.097596406937\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\66.jpg\n",
      "spending_time: -15504.539073944092\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\67.jpg\n",
      "spending_time: -15507.364074707031\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\68.jpg\n",
      "spending_time: -15511.844863176346\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\69.jpg\n",
      "spending_time: -15515.234978675842\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\70.jpg\n",
      "spending_time: -15517.630083799362\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_3\\71.jpg\n",
      "spending_time: -15520.2331097126\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\79.jpg\n",
      "spending_time: -15526.272839784622\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\80.jpg\n",
      "spending_time: -15531.884182453156\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\81.jpg\n",
      "spending_time: -15538.212777853012\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\82.jpg\n",
      "spending_time: -15544.188354969025\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\83.jpg\n",
      "spending_time: -15549.980967760086\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\84.jpg\n",
      "spending_time: -15556.236934185028\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\85.jpg\n",
      "spending_time: -15561.32389330864\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\86.jpg\n",
      "spending_time: -15567.228360652924\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\87.jpg\n",
      "spending_time: -15572.92380285263\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\88.jpg\n",
      "spending_time: -15579.14304947853\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_4\\89.jpg\n",
      "spending_time: -15581.227678537369\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\100.jpg\n",
      "spending_time: -15587.181280374527\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\101.jpg\n",
      "spending_time: -15592.700912714005\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\102.jpg\n",
      "spending_time: -15598.076518535614\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\103.jpg\n",
      "spending_time: -15604.081052303314\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\104.jpg\n",
      "spending_time: -15610.267772197723\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\105.jpg\n",
      "spending_time: -15611.601044893265\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_5\\99.jpg\n",
      "spending_time: -15617.336471796036\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\100.jpg\n",
      "spending_time: -15627.476733446121\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\101.jpg\n",
      "spending_time: -15637.738995313644\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\102.jpg\n",
      "spending_time: -15646.542377710342\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\103.jpg\n",
      "spending_time: -15656.355383396149\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\104.jpg\n",
      "spending_time: -15666.664938688278\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\105.jpg\n",
      "spending_time: -15672.21829032898\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_6\\99.jpg\n",
      "spending_time: -15681.934109926224\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\106.jpg\n",
      "spending_time: -15692.032712221146\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\107.jpg\n",
      "spending_time: -15702.362325191498\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\108.jpg\n",
      "spending_time: -15712.229922533035\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\109.jpg\n",
      "spending_time: -15721.68942642212\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\110.jpg\n",
      "spending_time: -15731.795525550842\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\111.jpg\n",
      "spending_time: -15741.346809387207\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_7\\112.jpg\n",
      "spending_time: -15745.692351579666\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\107.jpg\n",
      "spending_time: -15754.822616100311\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\108.jpg\n",
      "spending_time: -15764.933977127075\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\109.jpg\n",
      "spending_time: -15774.384259223938\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\110.jpg\n",
      "spending_time: -15783.785479784012\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\111.jpg\n",
      "spending_time: -15792.624977111816\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\112.jpg\n",
      "spending_time: -15801.149938821793\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\113.jpg\n",
      "spending_time: -15811.2026617527\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_8\\114.jpg\n",
      "spending_time: -15817.885289669037\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\119.jpg\n",
      "spending_time: -15826.502970933914\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\120.jpg\n",
      "spending_time: -15835.817973136902\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\121.jpg\n",
      "spending_time: -15845.079138755798\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\122.jpg\n",
      "spending_time: -15854.59240436554\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\123.jpg\n",
      "spending_time: -15864.807852983475\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\124.jpg\n",
      "spending_time: -15874.353905439377\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Ren_9\\125.jpg\n",
      "spending_time: -15875.569494962692\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\58.jpg\n",
      "spending_time: -15880.341353178024\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\59.jpg\n",
      "spending_time: -15885.79938006401\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\60.jpg\n",
      "spending_time: -15892.092293977737\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\61.jpg\n",
      "spending_time: -15896.109550714493\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\62.jpg\n",
      "spending_time: -15900.423306703568\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\63.jpg\n",
      "spending_time: -15904.300677537918\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_1\\64.jpg\n",
      "spending_time: -15909.433105707169\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\56.jpg\n",
      "spending_time: -15913.537339687347\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\57.jpg\n",
      "spending_time: -15918.262262821198\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\58.jpg\n",
      "spending_time: -15923.219396829605\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\59.jpg\n",
      "spending_time: -15926.812590837479\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\60.jpg\n",
      "spending_time: -15931.00171160698\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\61.jpg\n",
      "spending_time: -15935.272061109543\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\62.jpg\n",
      "spending_time: -15939.381357431412\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\63.jpg\n",
      "spending_time: -15943.044656515121\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_2\\64.jpg\n",
      "spending_time: -15948.070742845535\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_3\\3.jpg\n",
      "spending_time: -15951.075584411621\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_3\\4.jpg\n",
      "spending_time: -15957.057233095169\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_3\\5.jpg\n",
      "spending_time: -15962.99279141426\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_3\\6.jpg\n",
      "spending_time: -15966.598585128784\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_6\\75.jpg\n",
      "spending_time: -15971.336128473282\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_6\\76.jpg\n",
      "spending_time: -15979.640299320221\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_6\\77.jpg\n",
      "spending_time: -15985.94768166542\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_6\\78.jpg\n",
      "spending_time: -15990.664895296097\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_6\\79.jpg\n",
      "spending_time: -15997.946752548218\n",
      "C:\\Users\\54554\\Desktop\\Project\\race-nlp\\data\\book\\high\\Wai_6\\80.jpg\n",
      "spending_time: -16006.905915021896\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp'"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#处理初中的文本的路径\n",
    "path = Path(\"C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp\\\\data\\\\book\")\n",
    "path = path/\"high\"\n",
    "new_files = []\n",
    "for file in os.listdir(path):\n",
    "    tmp = high[file]\n",
    "    #print(tmp[0],tmp[1],type(tmp))\n",
    "    new_path = os.path.join(path, file)\n",
    "    #print(file)\n",
    "    for f in os.listdir(new_path):\n",
    "        #print(f)\n",
    "        if tmp[0] <= int(f.rstrip(\".jpg\")) <= tmp[1]:\n",
    "            path = os.path.join(new_path,f)\n",
    "            #print(f)\n",
    "            new_files += [path]#得到所有处理文件的路径\n",
    "            path = Path(\"C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp\\\\data\\\\book\\\\high\\\\\")\n",
    "#print(new_files)\n",
    "\n",
    "middle_word_list = []\n",
    "\"\"\"获取文字\"\"\"\n",
    "for jpg_path in new_files:\n",
    "    print(jpg_path)\n",
    "#---------------------------ocr-----------------------------------\n",
    "    pt.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n",
    "    all_words = []\n",
    "    start = time.time()\n",
    "    im = Image.open(os.path.abspath(jpg_path))\n",
    "    raw_word = pt.image_to_string(im,lang = 'eng')\n",
    "    words = re.sub(r\"[^A-Za-z\\s]+\", \"\", raw_word.strip())#正则提取所有带噪声的单词\n",
    "    dic = enchant.Dict(\"en_US\")#创建字典\n",
    "    for word in words.split():\n",
    "        if len(word)>=3 and dic.check(word.lower()):\n",
    "            all_words.append(word.lower())#去噪\n",
    "    start = time.time()\n",
    "    middle_word_list += all_words\n",
    "    print(\"spending_time:\",end - start)\n",
    "    if os.path.isdir(jpg_path):\n",
    "        print(\"当前\",len(all_words),\"个单词\")\n",
    "        print(\"共计\",len(middle_word_list),\"个单词\")\n",
    "#-------------------------------------------------------------------\n",
    "    path = Path(\"C:\\\\Users\\\\54554\\\\Desktop\\\\Project\\\\race-nlp\\\\data\\\\book\")#init path\n",
    "\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4127"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(middle_word_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pf = pd.DataFrame(data=middle_word_list)\n",
    "pf.to_csv(\"./data/book/high_word_list.csv\", encoding=\"utf-8\", header=False, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
